First we are going to get the data from the Supercias ranking website.
import subprocess
import sys

# Third-party packages this notebook depends on.  ('os' is stdlib and always
# imports; 'gridfs' ships with pymongo — both are kept for completeness.)
packages = ['seaborn', 'matplotlib', 'numpy', 'altair', 'pandas', 'os', 'pymongo', 'gridfs']
for package in packages:
    try:
        __import__(package)
    except ImportError:
        # The original used the `!pip install` shell magic, which only works
        # inside IPython; invoking pip through the current interpreter also
        # works when this runs as a plain script.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
# Standard stack for data wrangling and plotting.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import altair as alt
import os
# Enable the VegaFusion data transformer so Altair can handle dataframes
# larger than its default 5000-row limit, and use the default renderer.
alt.data_transformers.enable("vegafusion")
alt.renderers.enable('default')
# Display floats with thousands separators and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# Resolve each Supercias ranking CSV: prefer a local copy in the user's
# Downloads folder, otherwise fall back to downloading it from the website.
domain = 'https://appscvsmovil.supercias.gob.ec/ranking/recursos/'
# The user should change this folder to wherever the files were downloaded.
# expanduser('~') works even when the USER environment variable is unset.
folder = os.path.join(os.path.expanduser('~'), 'Downloads') + os.sep

def _resolve(filename):
    """Return the local path for *filename* if it exists in `folder`, else its URL under `domain`."""
    local = folder + filename
    return local if os.path.exists(local) else domain + filename

ranking_path = _resolve('bi_ranking.csv')
ids_path = _resolve('bi_compania.csv')
segmentos_path = _resolve('bi_segmento.csv')
ciiu_path = _resolve('bi_ciiu.csv')

# Financial figures (activos, patrimonio, ...) per company per year.
df_ranking = pd.read_csv(ranking_path, low_memory=False)
# Company master data (ids, names).
df_ids = pd.read_csv(ids_path, low_memory=False)
# Company segments.
df_segmentos = pd.read_csv(segmentos_path, low_memory=False)
# ISIC codes (Clasificación Industrial Internacional Uniforme).
df_ciiu = pd.read_csv(ciiu_path, low_memory=False)
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[1], line 32
29 ciiu_path = folder + 'bi_ciiu.csv' if 'bi_ciiu.csv' in os.listdir(folder) else domain + 'bi_ciiu.csv'
31 # read from a csv file into a pd dataframe
---> 32 df_ranking = pd.read_csv(ranking_path, low_memory=False)
33 # the companias string
34 df_ids = pd.read_csv(ids_path, low_memory=False)
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.local/lib/python3.12/site-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
725 codecs.lookup_error(errors)
727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
729 path_or_buf,
730 encoding=encoding,
731 compression=compression,
732 mode=mode,
733 storage_options=storage_options,
734 )
736 handle = ioargs.filepath_or_buffer
737 handles: list[BaseBuffer]
File ~/.local/lib/python3.12/site-packages/pandas/io/common.py:389, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
386 if content_encoding == "gzip":
387 # Override compression based on Content-Encoding header
388 compression = {"method": "gzip"}
--> 389 reader = BytesIO(req.read())
390 return IOArgs(
391 filepath_or_buffer=reader,
392 encoding=encoding,
(...)
395 mode=fsspec_mode,
396 )
398 if is_fsspec_url(filepath_or_buffer):
File /usr/lib64/python3.12/http/client.py:495, in HTTPResponse.read(self, amt)
493 else:
494 try:
--> 495 s = self._safe_read(self.length)
496 except IncompleteRead:
497 self._close_conn()
File /usr/lib64/python3.12/http/client.py:640, in HTTPResponse._safe_read(self, amt)
633 def _safe_read(self, amt):
634 """Read the number of bytes requested.
635
636 This function should be used when <amt> bytes "should" be present for
637 reading. If the bytes are truly not available (due to EOF), then the
638 IncompleteRead exception can be used to detect the problem.
639 """
--> 640 data = self.fp.read(amt)
641 if len(data) < amt:
642 raise IncompleteRead(data, amt-len(data))
File /usr/lib64/python3.12/socket.py:708, in SocketIO.readinto(self, b)
706 while True:
707 try:
--> 708 return self._sock.recv_into(b)
709 except timeout:
710 self._timeout_occurred = True
File /usr/lib64/python3.12/ssl.py:1252, in SSLSocket.recv_into(self, buffer, nbytes, flags)
1248 if flags != 0:
1249 raise ValueError(
1250 "non-zero flags not allowed in calls to recv_into() on %s" %
1251 self.__class__)
-> 1252 return self.read(nbytes, buffer)
1253 else:
1254 return super().recv_into(buffer, nbytes, flags)
File /usr/lib64/python3.12/ssl.py:1104, in SSLSocket.read(self, len, buffer)
1102 try:
1103 if buffer is not None:
-> 1104 return self._sslobj.read(len, buffer)
1105 else:
1106 return self._sslobj.read(len)
KeyboardInterrupt:
We also want to read from the Mongo DB and get all of the documents for each company.
from pymongo import MongoClient
from gridfs import GridFS
# Connect to a MongoDB instance running on localhost (default port 27017)
# and select the database holding the scraped Supercias documents.
endpoint = 'mongodb://127.0.0.1:27017'
db = MongoClient(endpoint)['supercias_ranking']
#companies = db['companies_suggestions']
# Sanity check that the collection exists and contains documents:
#print(companies.count_documents({}))
Let’s keep only the year 2023 and compute the pasivos by subtracting the patrimonio from the activos.
Let’s merge all of the dataframes into one.
# Keep only the 2023 financials.
df_ranking = df_ranking[df_ranking['anio'] == 2023]
# Give the CIIU columns explicit names before merging.
df_ciiu = df_ciiu.rename(columns={'descripcion': 'ciiu_desc', 'ciiu': 'ciiu_code'})
# Codes carry surrounding whitespace in the CSV; strip so the merges match.
df_ciiu['ciiu_code'] = df_ciiu['ciiu_code'].str.strip()
# Attach company master data to each ranking row via the expediente id.
df = pd.merge(df_ranking, df_ids, on='expediente', how='left')
# Attach the level-1 (broad sector) CIIU description...
df = pd.merge(df, df_ciiu, left_on='ciiu_n1', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n1_code', 'ciiu_desc': 'ciiu_n1_desc'}, inplace=True)
# ...and the level-6 (most specific) CIIU description.
df = pd.merge(df, df_ciiu, left_on='ciiu_n6', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n6_code', 'ciiu_desc': 'ciiu_n6_desc'}, inplace=True)
# Liabilities are not in the source data; derive them from the accounting
# identity: pasivos = activos - patrimonio.
df['pasivos'] = df['activos'] - df['patrimonio']
# Check for rows where activos != pasivos + patrimonio (data errors):
#df_error = df[df['activos'] != df['pasivos'] + df['patrimonio']]
# Keep only the name and the balance-sheet columns:
#df_error = df_error[['nombre', 'activos', 'pasivos', 'patrimonio']]
#df_error['margin'] = df_error['activos'] - df_error['pasivos'] - df_error['patrimonio']
#df_error.head()
#print(df.columns)
Let’s separate the companies into categories by the amount of activos.
# Bucket companies by total assets (activos):
#   small:  < 500 thousand
#   medium: 500 thousand - 50 million
#   large:  >= 50 million
df_small = df[df['activos'] < 500000]
df_medium = df[(df['activos'] >= 500000) & (df['activos'] < 50000000)]
df_large = df[(df['activos'] >= 50000000)]
# One shared label list: the original color scale domain used
# 'Large - 50m- up' while pie_df used 'Large - 50 - up', so the scale
# silently never matched the large bucket.  A single list fixes that.
SIZE_LABELS = ['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50m - up']
# Number of companies per size bucket, for the pie chart below.
pie_df = pd.DataFrame({
    'size': SIZE_LABELS,
    'count': [df_small.shape[0], df_medium.shape[0], df_large.shape[0]]
})
pie_df['percentage'] = (pie_df['count'] / pie_df['count'].sum()) * 100
# Fixed named color per bucket.
color_scale = alt.Scale(domain=SIZE_LABELS,
                        range=['steelblue', 'orange', 'mediumseagreen'])
pie_df.head()
# Pie chart of company counts per size bucket, with a percentage tooltip.
pie_chart = (
    alt.Chart(pie_df)
    .mark_arc()
    .encode(
        theta=alt.Theta('count:Q'),
        color=alt.Color('size:N', scale=color_scale),
        tooltip=[
            'size',
            'count',
            alt.Tooltip('percentage:Q', format='.1f', title='percentage'),
        ],
    )
    .interactive()
)
pie_chart.display()
# Share of companies by legal type ('tipo'), as an interactive pie chart.
type_counts = df['tipo'].value_counts().reset_index()
type_counts.columns = ['tipo', 'count']
type_counts['percentage'] = (type_counts['count'] / type_counts['count'].sum()) * 100
tipo_pie = alt.Chart(type_counts).mark_arc().encode(
    theta=alt.Theta('count:Q'),
    color=alt.Color('tipo:N'),
    tooltip=['tipo', 'count', alt.Tooltip('percentage:Q', format='.1f', title='percentage')],
)
tipo_pie.interactive().display()
def make_chart(df, title='Activos de Companias', max_bins=100, color='steelblue', width=450, height=300):
    """Linked histogram + scatter of company financials.

    Left: histogram of `activos`; clicking a bar selects a bin.
    Right: scatter of pasivos vs activos for the companies in the selected
    bin, colored by level-1 CIIU sector.

    Parameters
    ----------
    df : DataFrame with 'activos', 'pasivos', 'patrimonio', 'nombre'
        and 'ciiu_n1_desc' columns.
    title : title of the histogram.
    max_bins : maximum number of histogram bins.
    color : bar color for the histogram.
    width, height : size of each sub-chart in pixels.
    """
    # Point selection bound to the histogram's x encoding (one bin per click).
    click = alt.selection_point(encodings=['x'])
    # Histogram of activos.  The original wrapped the color in
    # alt.condition(click, ColorValue(color), ColorValue(color)) — both
    # branches identical, a no-op — so a plain color value is used instead.
    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X('activos:Q', bin=alt.Bin(maxbins=max_bins), title='Activos'),
        y=alt.Y('count()', title='Numero de companias'),
        color=alt.ColorValue(color),
        tooltip=['count():Q']
    ).add_params(
        click
    ).properties(
        width=width,
        height=height,
        title=title,
    ).interactive()
    # Scatter filtered to the clicked bin.  The original applied
    # transform_filter(click) twice (here and again on a wrapper); once
    # is sufficient.
    scatter = alt.Chart(df).mark_circle().encode(
        x='pasivos:Q',
        y='activos:Q',
        color='ciiu_n1_desc:N',
        tooltip=['nombre:N', 'activos:Q', 'pasivos:Q', 'patrimonio:Q', 'ciiu_n1_desc:N']
    ).transform_filter(
        click
    ).properties(
        width=width,
        height=height,
        title=''
    ).interactive()
    # Side-by-side, each chart keeping its own color legend.
    chart = alt.hconcat(bars, scatter).resolve_legend(
        color="independent",
    )
    # Use display() like the rest of the notebook; chart.show() requires the
    # separate altair_viewer package and would raise here.
    chart.display()
# Render the linked histogram/scatter for the small-company bucket.
make_chart(df_small, title='Activos de Companias pequenas', color='steelblue')